ISA DeepData
This notebook processes data from the International Seabed Authority (ISA) DeepData database into Darwin Core archives. The resulting datasets are hosted at https://datasets.obis.org/hosted/isa/index.html.
The code for this notebook is hosted at https://github.com/iobis/notebook-deepdata.
Data flow
Fetching the data from the ISA server
The DeepData dataset is delivered to OBIS via FTP. Let’s list the files on the FTP server and download the most recent JSON file to a temporary directory. Credentials are stored in env.txt.
# Load dependencies for FTP access and data manipulation.
# library() errors on a missing package; require() only returns FALSE.
library(RCurl)
library(stringr)
library(dplyr)

# Credentials and the FTP URL are kept out of the repository in env.txt.
readRenviron("env.txt")
url <- Sys.getenv("deepdata_url")

# List the files on the FTP server; EPSV is disabled for compatibility
# with the ISA server.
res <- getURL(url, verbose = TRUE, ftp.use.epsv = FALSE, dirlistonly = TRUE)

# Split the listing into individual names; trimws() strips any carriage
# returns a CRLF-terminated listing would leave behind.
filenames <- trimws(unlist(strsplit(res, "\\n")))

# Keep JSON files only. The previous pattern ".json" was an unanchored
# regex whose dot matched any character; escape the dot and anchor it at
# the end of the name so only *.json files are retained. Sorting in
# decreasing order puts the most recent file first.
filenames <- sort(filenames[str_detect(filenames, "\\.json$")], decreasing = TRUE)
if (length(filenames) != 1) {
  stop("Unexpected number of files found")
}
file_url <- paste0(url, "/", filenames[1])
temp_file <- tempfile(pattern = "deepdata_", tmpdir = tempdir(), fileext = ".json")
download.file(file_url, temp_file)
Parsing the JSON file
Earlier versions of the file were encoded in ISO-8859-1 rather than UTF-8, which made it necessary to use readLines before parsing the data with the jsonlite package, but that is fixed now.
Generating Darwin Core data files
We can now extract a list of distinct datasets from the data frame, and generate a Darwin Core archive for each dataset. Let’s first generate dataset identifiers from the dataset titles for later use in the dataset URLs. To account for possible future changes to dataset titles, I’m removing or fixing some words in the titles. The result should be that identifiers do not change when typos are fixed in the future.
library(knitr)

# Distinct dataset titles; Metadata is a nested data frame column.
titles <- records %>%
  distinct(Metadata$title) %>%
  pull("Metadata$title")

# Derive stable dataset identifiers from the titles. Known typos are
# fixed and the word "template" is dropped before slugifying, so the
# identifiers stay the same when the typos are corrected upstream.
identifiers <- titles %>%
  tolower() %>%
  str_replace("meiofaun$", "meiofauna") %>%
  str_replace("templaye", "template") %>%
  str_replace("template", "") %>%
  str_replace("biodiveristy", "biodiversity") %>%
  str_replace("macrfaun$", "macrofauna") %>%
  str_replace("meofauna", "meiofauna") %>%
  str_replace("meiobent$", "meiobenthos") %>%
  # Replace every hyphen, not only the first one: str_replace() stops at
  # the first match, which would leave later hyphens in multi-hyphen
  # titles untouched.
  str_replace_all("-", " ") %>%
  str_squish() %>%
  str_replace_all("\\s", "_")

# The title-to-identifier mapping must remain one-to-one.
stopifnot(length(unique(titles)) == length(unique(identifiers)))
records$dataset_id <- identifiers[match(records$Metadata$title, titles)]
data.frame(titles, identifiers) %>%
  kable()
| titles | identifiers |
|---|---|
| BGRPMN12015 BIODIVERSITY | bgrpmn12015_biodiversity |
| BGRPMN12017 Biodiveristy | bgrpmn12017_biodiversity |
| BGRPMN12016 Biodiversity Envdata | bgrpmn12016_biodiversity_envdata |
| RUSMNRCRFC12015 Env Template Biodata | rusmnrcrfc12015_env_biodata |
| KOREAPMN12019 Macrofauna2018 | koreapmn12019_macrofauna2018 |
| BGRPMN12017 Env Template MANGAN2016 | bgrpmn12017_env_mangan2016 |
| BGRPMN12017 Env Template FLUM | bgrpmn12017_env_flum |
| GSRPMN12017 Env Template BIOSO239 | gsrpmn12017_env_bioso239 |
| DORDPMN12016 Mn2016 ENV | dordpmn12016_mn2016_env |
| DORDPMN12018 Env Mn Bio | dordpmn12018_env_mn_bio |
| IOMPMN12018 Env Template BIOL | iompmn12018_env_biol |
| BGRPMN12017 Env Template MANGAN2014 | bgrpmn12017_env_mangan2014 |
| KOREAPMN12011 Env Template 2011 abundance | koreapmn12011_env_2011_abundance |
| IOMPMN12015 Annex 1 env | iompmn12015_annex_1_env |
| COMRAPMS12017 Env Template zooplankton | comrapms12017_env_zooplankton |
| COMRAPMS12018 zooplankton | comrapms12018_zooplankton |
| YUZHPMN12015 Biodata B6 | yuzhpmn12015_biodata_b6 |
| YUZPMN12016 Biodata | yuzpmn12016_biodata |
| UKSRLPMN12015 Env Template Scavengers 032016 | uksrlpmn12015_env_scavengers_032016 |
| UKSRLPMN12016 NHM-UNI | uksrlpmn12016_nhm_uni |
| UKSRLPMN12016 Senkenberg | uksrlpmn12016_senkenberg |
| UKSRLPMN12016 Megafauna | uksrlpmn12016_megafauna |
| UKSRLPMN12017 NHM UNI | uksrlpmn12017_nhm_uni |
| UKSRLPMN12017 Senkenberg Macrofauna | uksrlpmn12017_senkenberg_macrofauna |
| GSRPMN12016 MarBiol UGent | gsrpmn12016_marbiol_ugent |
| IFREMERPMN12015 AR ENV | ifremerpmn12015_ar_env |
| COMRAPMN12018 Plankton2017 | comrapmn12018_plankton2017 |
| COMRAPMN12018 Lander2017 | comrapmn12018_lander2017 |
| COMRAPMS12017 Env Template phytoplankton | comrapms12017_env_phytoplankton |
| IFREMERPMN12018 Nodinaut Nematoda Copepoda | ifremerpmn12018_nodinaut_nematoda_copepoda |
| JOGMECCRFC12018 Env Template HK17-01 phyto | jogmeccrfc12018_env_hk17_01_phyto |
| JOGMECCRFC12018 Env Template JK18 picoplankton | jogmeccrfc12018_env_jk18_picoplankton |
| JOGMECCRFC12018 Env Template HK17 NEMA | jogmeccrfc12018_env_hk17_nema |
| COMRACRFC12016 Env Template DY37 megafauna | comracrfc12016_env_dy37_megafauna |
| COMRACRFC12015 Env Template Meiofaun | comracrfc12015_env_meiofauna |
| COMRACFRC120015 Env Template 2015 demersal scavenger | comracfrc120015_env_2015_demersal_scavenger |
| COMRACRFC12015 Env Template MacrFaun | comracrfc12015_env_macrofauna |
| COMRACFRC120015 Env Template 2015 macrofauna | comracfrc120015_env_2015_macrofauna |
| KOREAPMN12010 Env Template 2010 biomass | koreapmn12010_env_2010_biomass |
| KOREAPMN12013 Env Template 2013 abundance | koreapmn12013_env_2013_abundance |
| COMRACRFC12017 Env Template DY36 meiofauna | comracrfc12017_env_dy36_meiofauna |
| KOREAPMN12019 macrofauna2019 | koreapmn12019_macrofauna2019 |
| UKSRLPMN12017 NOC NERC | uksrlpmn12017_noc_nerc |
| OMSPMN12018 Env NHM NORCE | omspmn12018_env_nhm_norce |
| UKSRLPMN12016 AB02 NOCS | uksrlpmn12016_ab02_nocs |
| COMRAPMN12016 Sequences | comrapmn12016_sequences |
| COMRAPMS12015 ENV | comrapms12015_env |
| COMRAPMS12015 Env Template Meiobent | comrapms12015_env_meiobenthos |
| OMSPMN12018 NUS Data | omspmn12018_nus_data |
| COMRAPMS12018 Phytoplankton | comrapms12018_phytoplankton |
| IFREMERPMN12018 SO239 | ifremerpmn12018_so239 |
| UKSRLPMN12015 Env Template Macrofauna 032016 | uksrlpmn12015_env_macrofauna_032016 |
| UKSRLPMN12015 Env Template Senckenberg 032016 | uksrlpmn12015_env_senckenberg_032016 |
| UKSRLPMN12015 Env Template GG 032020163 | uksrlpmn12015_env_gg_032020163 |
| UKSRLPMN12015 Env Template AB01 NHM | uksrlpmn12015_env_ab01_nhm |
| UKSRLPMN12015 Env Templaye Megafauna 032016 | uksrlpmn12015_env_megafauna_032016 |
| UKSRLPMN12017 Senkenberg Meofauna | uksrlpmn12017_senkenberg_meiofauna |
| COMRAPMN12014 Env Template W1101 | comrapmn12014_env_w1101 |
| COMRAPMN12014 Env Template W1102 | comrapmn12014_env_w1102 |
| COMRAPMN12014 Env Template WS1102 | comrapmn12014_env_ws1102 |
| COMRAPMN12014 Env Template WS1104 | comrapmn12014_env_ws1104 |
| IFREMERPMN12017 Env Template BIO1 2017 | ifremerpmn12017_env_bio1_2017 |
| COMRACRFC12017 Env Template DY29 zooplankton | comracrfc12017_env_dy29_zooplankton |
Extracting occurrence data
Let’s first create a new ID column, this will be used later to link together the measurements and occurrences, and to select records by dataset. We cannot use occurrenceID here because these are not unique within the dataset.
library(uuid)
# One UUID per record, used to link measurements to occurrences and to
# select records per dataset; occurrenceID cannot serve this role
# because it is not unique within the dataset.
# NOTE(review): use.time = NA lets the uuid package choose the UUID
# method — confirm this yields random (v4) identifiers on the target
# platform.
records$id <- UUIDgenerate(use.time = NA, n = nrow(records))
stopifnot(length(unique(records$id)) == nrow(records))
Now we can select and process the columns that will go into the occurrence table.
Initial cleanup of occurrence data
First clean up any escaped newlines, empty strings, and placeholder values. Also fix basisOfRecord and convert coordinates to numeric values:
library(stringr)
# Clean up the raw occurrence columns:
# - strip escaped newline sequences
# - turn empty strings into NA
# - collapse repeated whitespace in character columns
# - map placeholder values ("indet", "Not Reported", "indet.") to NA
# - force basisOfRecord to HumanObservation
# - convert coordinate columns to numeric
# NOTE(review): na_if(., "") runs before str_squish(), so values that
# become empty only after squishing are not converted to NA — confirm
# this ordering is intended.
occ <- occ %>%
mutate_all(~gsub("\\n", "", .)) %>%
mutate_all(~na_if(., "")) %>%
mutate(across(where(is.character), str_squish)) %>%
mutate_all(~replace(., . %in% c("indet", "Not Reported", "indet."), NA)) %>%
mutate(basisOfRecord = "HumanObservation") %>%
mutate(
decimalLongitude = as.numeric(decimalLongitude),
decimalLatitude = as.numeric(decimalLatitude)
)
Let’s check for coordinate issues:
Let’s take a look at scientificName and scientificNameID.
# Frequency table of scientificName values, most common first.
occ %>%
group_by(scientificName) %>%
summarize(records = n()) %>%
arrange(desc(records)) %>%
rmarkdown::paged_table()occ %>%
# Frequency table of scientificNameID values, most common first.
group_by(scientificNameID) %>%
summarize(records = n()) %>%
arrange(desc(records)) %>%
rmarkdown::paged_table()So at least in the current version at the time of writing (June 2021) there are some quality issues for scientificName.
Fixing taxonomy
Let’s try to clean up the scientific names before we do taxon matching with WoRMS. Here I’m using the gni_parse() function from the taxize package, which connects to the GNI name parser. If a name cannot be parsed, I’m keeping the original.
The first step is to create a list of all distinct names in the taxonomy columns.
# Collect the taxonomy-related columns from the occurrence table.
taxonomy <- select(occ, phylum, class, order, family, genus, scientificName)
names <- na.omit(unique(unlist(taxonomy)))
Then pass through the name parser:
library(taxize)

# Parse a taxon name into its canonical form using the GNI name parser.
# Returns the canonical name when parsing yields exactly one result,
# otherwise falls back to the original name (parser errors, network
# failures and ambiguous results all take the fallback path).
# The previous version assigned the tryCatch result to an unused local
# (`parsed <-`) and relied on the assignment's invisible return value;
# the value is now returned directly.
clean_name <- function(name) {
  tryCatch({
    res <- gni_parse(name)
    stopifnot(nrow(res) == 1)
    res$canonical[1]
  }, error = function(cond) {
    # Keep the original name rather than dropping the record.
    name
  })
}
names_clean <- sapply(names, clean_name)
Now use the cleaned names for taxon matching:
library(worrms)

# Match a cleaned name against WoRMS and return the LSID of the first
# exact or exact_genus match. Returns NA when there is no match or the
# service call fails. The previous version assigned the result to an
# unused local (`lsid <-`) and relied on the assignment's invisible
# return value; the value is now returned directly.
match_name <- function(name) {
  tryCatch({
    res <- wm_records_names(name)
    matches <- res[[1]] %>%
      filter(match_type %in% c("exact", "exact_genus"))
    if (nrow(matches) > 1) {
      message(paste0("Multiple exact matches for ", name))
    }
    # With zero matches this subsets to NA, the desired sentinel.
    matches$lsid[1]
  }, error = function(cond) {
    NA
  })
}
lsids <- sapply(names_clean, match_name)
Now we need to find the lowest taxonomic level at which we find a name. Note that this will result in records with less taxonomic resolution than intended. Ideally we would only match on scientificName. First translate the taxonomy columns to LSIDs:
## # A tibble: 55,065 x 6
## phylum class order family genus scientificName
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 urn:lsid:mari… urn:lsid:mari… <NA> <NA> <NA> urn:lsid:marinespec…
## 2 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA> <NA> urn:lsid:marinespec…
## 3 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA> <NA> urn:lsid:marinespec…
## 4 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA> <NA> urn:lsid:marinespec…
## 5 urn:lsid:mari… urn:lsid:mari… urn:lsid:mar… <NA> <NA> urn:lsid:marinespec…
## 6 urn:lsid:mari… <NA> <NA> <NA> <NA> urn:lsid:marinespec…
## 7 urn:lsid:mari… urn:lsid:mari… <NA> <NA> <NA> urn:lsid:marinespec…
## 8 urn:lsid:mari… urn:lsid:mari… <NA> <NA> <NA> urn:lsid:marinespec…
## 9 urn:lsid:mari… urn:lsid:mari… <NA> <NA> <NA> urn:lsid:marinespec…
## 10 <NA> <NA> <NA> <NA> <NA> <NA>
## # … with 55,055 more rows
Then find the most specific one for each row:
# Pick the most specific available LSID for each row: coalesce() takes
# the first non-NA value, so the columns are ordered from most to least
# specific. The previous chain stopped at class, which silently dropped
# LSIDs for rows identified to phylum only; phylum is now included as
# the final fallback.
taxonomy_clean <- taxonomy_clean %>%
  mutate(best = coalesce(scientificName, genus, family, order, class, phylum))
I’ll use the resulting LSIDs to replace the provided scientificNameIDs.
Let’s take another look at the top scientificName and scientificNameID after matching:
# Top 30 combinations of scientificName and scientificNameID by record
# count, most common first.
occ %>%
group_by(scientificName, scientificNameID) %>%
summarize(records = n()) %>%
arrange(desc(records)) %>%
head(30) %>%
knitr::kable()
Extracting MeasurementOrFact data
# Extract the MeasurementOrFact extension table from the raw records.
# The record id and dataset_id are kept so measurements can be linked
# back to occurrences and filtered per dataset. Rows missing either a
# measurementType or a measurementValue are dropped.
extract_mof <- function(df) {
  df %>%
    select("id", "dataset_id", "MeasurementOrFact") %>%
    # MeasurementOrFact is a nested data frame column; flatten it into
    # regular columns and strip the "MeasurementOrFact." name prefix.
    jsonlite::flatten() %>%
    rename_all(~str_replace(., ".*\\.", "")) %>%
    mutate(across(where(is.character), str_squish)) %>%
    # mutate_all() is superseded; across(everything(), ...) is the
    # current idiom for the same operation.
    mutate(across(everything(), ~na_if(., ""))) %>%
    filter(!is.na(measurementType) & !is.na(measurementValue)) %>%
    as_tibble()
}
mof <- extract_mof(records)
mof
## # A tibble: 12,218 x 6
## id dataset_id measurementID measurementType measurementValue
## <chr> <chr> <chr> <chr> <chr>
## 1 92646046-dd1b… bgrpmn12015_b… MUC20401639879 Relative abund… 11.9718309859155
## 2 bd432529-6d7c… bgrpmn12015_b… MUC20411639880 Relative abund… 5.63380281690141
## 3 6152e89b-e25f… bgrpmn12015_b… MUC20421639881 Relative abund… 15.4929577464789
## 4 eef28bf0-63e6… bgrpmn12015_b… MUC20431639882 Relative abund… 33.0985915492958
## 5 59a71595-6486… bgrpmn12015_b… MUC20441639883 Relative abund… 10.5633802816901
## 6 8ddf6f5e-b190… bgrpmn12015_b… MUC20451639884 Relative abund… 11.9718309859155
## 7 fb8555cb-a43d… bgrpmn12015_b… MUC20461639885 Relative abund… 5.63380281690141
## 8 def8eb13-2111… bgrpmn12015_b… MUC20471639886 Relative abund… 0.7042253521126…
## 9 9db90d2c-24e3… bgrpmn12015_b… MUC20481639887 Relative abund… 0.7042253521126…
## 10 cc51a6e5-bf59… bgrpmn12015_b… MUC20491639888 Relative abund… 2.8169014084507
## # … with 12,208 more rows, and 1 more variable: measurementUnit <chr>
A number of records appear to have empty values. To demonstrate this, let’s take a look at the most common combinations of measurementType and measurementValue:
mof %>%
group_by(measurementType, measurementValue) %>%
summarize(records = n()) %>%
arrange(desc(records)) %>%
head(10) %>%
knitr::kable()| measurementType | measurementValue | records |
|---|---|---|
| Relative abundance | ns | 4938 |
| Relative abundance | 0 | 404 |
| Relative abundance | in progress | 141 |
| Relative abundance | 0.09765625 | 78 |
| Relative abundance | 0.282485875706215 | 72 |
| Relative abundance | 0.122100122100122 | 58 |
| Relative abundance | 0.01 | 57 |
| Relative abundance | 0.25 | 56 |
| Relative abundance | 0.26525198938992 | 56 |
| Relative abundance | 0.03 | 51 |
Generating Darwin Core Archives
Generating EML
For demonstration purposes, I’m working with the dataset pertaining to the first record here. The EML template is read from templates/eml.xml:
library(readr)
library(glue)
# Generate an EML metadata document for a dataset. The template in
# templates/eml.xml contains glue placeholders; glue(eml) interpolates
# them from the local variables in this function's environment, so the
# variable names below must match the placeholder names in the template
# exactly — do not rename them.
generate_eml <- function(df) {
eml <- read_file("templates/eml.xml")
# Metadata of the first record represents the whole dataset.
metadata <- df$Metadata[1,]
# Creator name is split on the first space into given name and surname.
firstname <- strsplit(metadata$Creator$name, " ")[[1]][1]
lastname <- strsplit(metadata$Creator$name, " ")[[1]][2]
organization <- metadata$Creator$organisation
email <- metadata$Creator$email
position <- metadata$Creator$position
# NOTE(review): the creator_* fields are blank except the organization,
# which is taken from the Contact entry — confirm this mapping against
# the placeholders in templates/eml.xml.
creator_firstname <- ""
creator_lastname <- ""
creator_organization <- metadata$Contact$organisation
creator_email <- ""
creator_position <- ""
abstract <- metadata$abstract
title <- metadata$title
citation <- metadata$citation
packageid <- "https://datasets.obis.org/deepdata"
# Publication date and timestamp are generated at run time.
pubdate <- format(Sys.time(), "%Y-%m-%d")
datestamp <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
glue(eml)
}
generate_eml(records)
## <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1"
## xmlns:dc="http://purl.org/dc/terms/"
## xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
## xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
## packageId="https://datasets.obis.org/deepdata" system="http://gbif.org" scope="system"
## xml:lang="eng">
##
## <dataset>
## <title xml:lang="eng">BGRPMN12015 BIODIVERSITY</title>
## <pubDate>2021-06-05</pubDate>
## <language>eng</language>
## <abstract>
## <para>Sampling data captured in Oceanic Exploration Research Based mainly on nodule abundances in 221 box core samples and the hydro-acoustic data obtained during seven cruises to the German license area, resource modelling using artificial neural network statistics was carried out for the entire license area of 75,000 km2 size, and in particular for four prospective potential mining areas in the eastern license area with a total size of 1038 km2. According to these assessments, the entire license area contains 620 million metric tonnes of nodules (dry weight) with 195 million t of Mn, 8.7 million t of Ni, 7.3 million t of Cu, 1.6 million t of Ti, and 1.1 million t of Co. Results of geostatistical analyses show that the mean nodule abundance in the four prospective areas in the eastern license area (“sub-clusters”) varies between 17.7 and 23.1 kg/m2 and the resources here amount to 14 million metric tonnes of nodules in total (dry weight), containing 4.4 million t of Mn, 198,000 t of Ni, 165,000 t of Cu, 36,000 t of Ti, and 24,000 t of Co.</para>
## </abstract>
## <keywordSet>
## <keyword>Occurrence</keyword>
## <keywordThesaurus>GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type.xml</keywordThesaurus>
## </keywordSet>
## <intellectualRights>
## <para>This work is licensed under a <ulink url="http://creativecommons.org/licenses/by/4.0/legalcode"><citetitle>Creative Commons Attribution (CC-BY) 4.0 License</citetitle></ulink>.</para>
## </intellectualRights>
## <maintenance>
## <description>
## <para></para>
## </description>
## <maintenanceUpdateFrequency>unkown</maintenanceUpdateFrequency>
## </maintenance>
## <creator>
## <individualName>
## <givenName></givenName>
## <surName></surName>
## </individualName>
## <organizationName>Federal Institute for Geosciences and Natural Resources of Germany</organizationName>
## <positionName></positionName>
## <electronicMailAddress></electronicMailAddress>
## </creator>
## <metadataProvider>
## <individualName>
## <givenName>Sheldon</givenName>
## <surName>Carter</surName>
## </individualName>
## <organizationName>International Seabed Authority</organizationName>
## <positionName>Database Manager</positionName>
## <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
## </metadataProvider>
## <contact>
## <individualName>
## <givenName>Sheldon</givenName>
## <surName>Carter</surName>
## </individualName>
## <organizationName>International Seabed Authority</organizationName>
## <positionName>Database Manager</positionName>
## <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
## </contact>
## </dataset>
## <additionalMetadata>
## <metadata>
## <gbif>
## <dateStamp>2021-06-05T14:02:10+0200</dateStamp>
## <hierarchyLevel>dataset</hierarchyLevel>
## <citation>Federal Institute for Geosciences and Natural Resources of Germany, (2015). BGRPMN12015 BIODIVERSITY. Available : DeepData, International Seabed Authority https://data.isa.org.jm/ Accessed: [YYYY-MM-DD].</citation>
## </gbif>
## </metadata>
## </additionalMetadata>
## </eml:eml>
Generating an archive descriptor file
The archive also needs to include a meta.xml file which describes the files in the archive and their relationships.
Let’s first get a list of terms including their qualName.
library(xml2)
# map_df() comes from purrr; attach it explicitly rather than relying on
# it being loaded elsewhere.
library(purrr)

# Read a Darwin Core extension definition XML and return a tibble with
# one row per <property> element, holding the term name and its
# qualified name (qualName). The previous version assigned the result to
# an unused local (`terms <-`) and relied on the assignment's invisible
# return value; the value is now returned directly.
get_terms <- function(url) {
  read_xml(url) %>%
    xml_ns_strip() %>%
    xml_find_all(".//property") %>%
    map_df(function(x) {
      list(
        name = xml_attr(x, "name"),
        qual = xml_attr(x, "qualName")
      )
    })
}
occurrence_terms <- get_terms("https://rs.gbif.org/core/dwc_occurrence_2020-07-15.xml")
mof_terms <- get_terms("https://rs.gbif.org/extension/obis/extended_measurement_or_fact.xml")
Using these we can generate a list of terms to go into the meta.xml file for each table.
# Generate the meta.xml archive descriptor. The template in
# templates/meta.xml contains glue placeholders for occurrence_lines and
# mof_lines; glue(meta) interpolates them from this environment, so do
# not rename those variables.
generate_meta <- function(occ, mof) {
# Map each occurrence column to its Darwin Core qualified name; index is
# the zero-based column position expected by the Darwin Core text spec.
occurrence_fields <- tibble(name = names(occ)) %>%
left_join(occurrence_terms, by = "name") %>%
mutate(index = as.numeric(row.names(.)) - 1)
occurrence_lines <- paste0("<field index=\"", occurrence_fields$index, "\" term=\"", occurrence_fields$qual, "\"/>")
# The first column (id) is the core record identifier.
occurrence_lines[1] <- "<id index=\"0\" />"
occurrence_lines <- paste0(occurrence_lines, collapse = "\n")
# Same mapping for the measurement extension; its first column links
# back to the core record.
# NOTE(review): columns without a matching term (e.g. dataset_id) render
# as term="NA" in the output — confirm whether they should be dropped or
# mapped instead.
mof_fields <- tibble(name = names(mof)) %>%
left_join(mof_terms, by = "name") %>%
mutate(index = as.numeric(row.names(.)) - 1)
mof_lines <- paste0("<field index=\"", mof_fields$index, "\" term=\"", mof_fields$qual, "\"/>")
mof_lines[1] <- "<coreid index=\"0\" />"
mof_lines <- paste0(mof_lines, collapse = "\n")
meta <- read_file("templates/meta.xml")
glue(meta)
}
generate_meta(occ, mof)
## <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
## <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
## <files>
## <location>occurrence.txt</location>
## </files>
## <id index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/individualCount"/>
## <field index="6" term="http://rs.tdwg.org/dwc/terms/organismQuantity"/>
## <field index="7" term="http://rs.tdwg.org/dwc/terms/organismQuantityType"/>
## <field index="8" term="http://rs.tdwg.org/dwc/terms/sex"/>
## <field index="9" term="http://rs.tdwg.org/dwc/terms/occurrenceStatus"/>
## <field index="10" term="http://rs.tdwg.org/dwc/terms/associatedSequences"/>
## <field index="11" term="http://rs.tdwg.org/dwc/terms/occurrenceRemarks"/>
## <field index="12" term="http://rs.tdwg.org/dwc/terms/eventID"/>
## <field index="13" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
## <field index="14" term="http://rs.tdwg.org/dwc/terms/eventTime"/>
## <field index="15" term="http://rs.tdwg.org/dwc/terms/year"/>
## <field index="16" term="http://rs.tdwg.org/dwc/terms/month"/>
## <field index="17" term="http://rs.tdwg.org/dwc/terms/day"/>
## <field index="18" term="http://rs.tdwg.org/dwc/terms/habitat"/>
## <field index="19" term="http://rs.tdwg.org/dwc/terms/samplingProtocol"/>
## <field index="20" term="http://rs.tdwg.org/dwc/terms/eventRemarks"/>
## <field index="21" term="http://rs.tdwg.org/dwc/terms/locationID"/>
## <field index="22" term="http://rs.tdwg.org/dwc/terms/minimumDepthInMeters"/>
## <field index="23" term="http://rs.tdwg.org/dwc/terms/maximumDepthInMeters"/>
## <field index="24" term="http://rs.tdwg.org/dwc/terms/verbatimDepth"/>
## <field index="25" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
## <field index="26" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
## <field index="27" term="http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem"/>
## <field index="28" term="http://rs.tdwg.org/dwc/terms/verbatimSRS"/>
## <field index="29" term="http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters"/>
## <field index="30" term="http://rs.tdwg.org/dwc/terms/identificationID"/>
## <field index="31" term="http://rs.tdwg.org/dwc/terms/typeStatus"/>
## <field index="32" term="http://rs.tdwg.org/dwc/terms/identifiedBy"/>
## <field index="33" term="http://rs.tdwg.org/dwc/terms/dateIdentified"/>
## <field index="34" term="http://rs.tdwg.org/dwc/terms/identificationVerificationStatus"/>
## <field index="35" term="http://purl.org/dc/terms/type"/>
## <field index="36" term="http://purl.org/dc/terms/license"/>
## <field index="37" term="http://purl.org/dc/terms/rightsHolder"/>
## <field index="38" term="http://purl.org/dc/terms/accessRights"/>
## <field index="39" term="http://purl.org/dc/terms/bibliographicCitation"/>
## <field index="40" term="http://rs.tdwg.org/dwc/terms/institutionID"/>
## <field index="41" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
## <field index="42" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
## <field index="43" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
## <field index="44" term="http://rs.tdwg.org/dwc/terms/scientificNameID"/>
## <field index="45" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
## <field index="46" term="http://rs.tdwg.org/dwc/terms/phylum"/>
## <field index="47" term="http://rs.tdwg.org/dwc/terms/class"/>
## <field index="48" term="http://rs.tdwg.org/dwc/terms/order"/>
## <field index="49" term="http://rs.tdwg.org/dwc/terms/family"/>
## <field index="50" term="http://rs.tdwg.org/dwc/terms/genus"/>
## <field index="51" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
## <field index="52" term="http://rs.tdwg.org/dwc/terms/taxonomicStatus"/>
## <field index="53" term="http://rs.tdwg.org/dwc/terms/taxonRemarks"/>
## </core>
## <extension encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact">
## <files>
## <location>extendedmeasurementorfact.txt</location>
## </files>
## <coreid index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/measurementID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/measurementType"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/measurementValue"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/measurementUnit"/>
## </extension>
## </archive>
Bringing it all together
Now we can generate an archive for each dataset. While I’m generating datasets I’m also populating the RSS feed and creating dataset landing pages.
baseurl <- "https://datasets.obis.org/hosted/isa/"
item_template <- read_file("templates/rss_item.xml")
landing_template <- read_file("templates/index_dataset.html")
items <- list()
pubdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %z")

# Start from a clean output directory.
unlink("output", recursive = TRUE)
dir.create("output")

datasetids <- unique(records$dataset_id)
for (datasetid in datasetids) {
  # The first record of each dataset carries the metadata used for the
  # EML document, the RSS item and the landing page.
  dataset <- records %>%
    filter(dataset_id == datasetid) %>%
    head(1)
  # Escape XML special characters in the abstract. The ampersand must be
  # escaped first so entities produced by the later steps are not
  # double-escaped. str_replace_all() is required: str_replace() only
  # substitutes the first occurrence, leaving any later special
  # characters unescaped.
  dataset$Metadata$abstract <- dataset$Metadata$abstract %>%
    str_replace_all("&", "&amp;") %>%
    str_replace_all(">", "&gt;") %>%
    str_replace_all("<", "&lt;") %>%
    str_replace_all("'", "&apos;") %>%
    str_replace_all("\"", "&quot;")
  title <- dataset$Metadata$title
  abstract <- dataset$Metadata$abstract
  link <- paste0(baseurl, datasetid, "/index.html")
  dwca <- paste0(baseurl, datasetid, "/", datasetid, ".zip")
  # Clear and recreate the dataset directory.
  unlink(paste0("output/", datasetid), recursive = TRUE)
  dir.create(paste0("output/", datasetid))
  # RSS feed item; glue() interpolates title, abstract, link, dwca and
  # pubdate from this environment.
  item <- glue(item_template)
  items[[datasetid]] <- item
  # Dataset landing page.
  landing <- glue(landing_template)
  writeLines(landing, paste0("output/", datasetid, "/index.html"))
  # Archive data files for this dataset.
  dataset_occ <- occ %>% filter(dataset_id == datasetid)
  dataset_mof <- mof %>% filter(dataset_id == datasetid)
  eml <- generate_eml(dataset)
  # meta.xml depends only on the column names, which are identical for
  # every dataset, so the full tables can be passed here.
  meta <- generate_meta(occ, mof)
  write.table(dataset_occ, file = paste0("output/", datasetid, "/occurrence.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  write.table(dataset_mof, file = paste0("output/", datasetid, "/extendedmeasurementorfact.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  writeLines(eml, paste0("output/", datasetid, "/eml.xml"))
  writeLines(meta, paste0("output/", datasetid, "/meta.xml"))
  # Zip from inside the dataset directory so the archive has a flat
  # structure; restore the working directory even if zip() fails.
  files <- c("occurrence.txt", "extendedmeasurementorfact.txt", "eml.xml", "meta.xml")
  old_wd <- setwd(paste0("output/", datasetid))
  tryCatch({
    zip(glue("{datasetid}.zip"), files)
    for (f in files) {
      file.remove(f)
    }
  }, finally = setwd(old_wd))
}
}
Data publishing
In this section all files are uploaded to an S3 bucket. A list of datasets is visible at https://datasets.obis.org/hosted/isa/index.html, and an RSS file is available for the OBIS harvester.
Generate RSS file
Generate landing page
Uploading to S3
library("aws.s3")

# NOTE(review): delete_object() removes a single key, not a prefix — the
# "hosted/isa/" key itself is deleted, but existing objects under that
# prefix are not. If stale dataset files must be removed, list them with
# get_bucket() and delete them individually; confirm intended behavior.
delete_object("hosted/isa/", bucket = "obis-datasets")

# Upload every generated file, mapping output/<dataset>/<file> to
# hosted/isa/<dataset>/<file>, publicly readable for the OBIS harvester.
files <- list.files("output", full.names = TRUE, recursive = TRUE, include.dirs = FALSE)
for (file in files) {
  target <- str_replace(file, "output", "hosted/isa")
  message(target)
  put_object(file, object = target, bucket = "obis-datasets", acl = "public-read")
}